% AMD GCN architecture whitepaper; the URL is kept in `note` like the other web entries.
@misc{amd2012GcnWhitepaper,
  author = {{AMD, Inc.}},
  title  = {{AMD Graphics Cores Next (GCN) Architecture}},
  note   = {http://www.amd.com/jp/Documents/GCN\_Architecture\_whitepaper.pdf},
  month  = jun,
  year   = {2012},
}

% Multi2Sim project web site.
@misc{m2s_url,
  title = {{The Multi2Sim Simulation Framework: a CPU-GPU Model for Heterogeneous Computing}},
  note  = {http://www.multi2sim.org/}
}

% The URL is stored unbroken; line breaking belongs to the document, not the database.
@misc{m2s_benchmarks_SI,
  title = {{OpenCL 2.5 Southern Islands}},
  note  = {http://www.multi2sim.org/benchmarks/amdapp-2.5-si.html},
}

@misc{app-sdk,
  title = {{AMD Accelerated Parallel Processing (APP) Software Development Kit (SDK)}},
  note  = {http://developer.amd.com/sdks/amdappsdk/}
}

@inproceedings{WavefrontScheduling,
  author    = {Rogers, Timothy G. and O'Connor, Mike and Aamodt, Tor M.},
  title     = {Cache-Conscious Wavefront Scheduling},
  booktitle = {Proceedings of the 2012 45th Annual IEEE/ACM International Symposium on Microarchitecture},
  series    = {MICRO '12},
  year      = {2012},
  pages     = {72--83},
  numpages  = {12},
  publisher = {IEEE Computer Society},
  address   = {Washington, DC, USA},
  isbn      = {978-0-7695-4924-8},
  doi       = {10.1109/MICRO.2012.16},
  url       = {http://dx.doi.org/10.1109/MICRO.2012.16},
  acmid     = {2457487},
}

% The url uses the public ACM DOI resolver rather than an institution-specific library proxy.
@inproceedings{Ubal2012Multi2sim,
  author    = {Ubal, Rafael and Jang, Byunghyun and Mistry, Perhaad and Schaa, Dana and Kaeli, David},
  title     = {{Multi2Sim}: a simulation framework for {CPU-GPU} computing},
  booktitle = {Proceedings of the 21st international conference on Parallel architectures and compilation techniques},
  series    = {PACT '12},
  location  = {Minneapolis, Minnesota, USA},
  year      = {2012},
  pages     = {335--344},
  numpages  = {10},
  publisher = {ACM},
  address   = {New York, NY, USA},
  isbn      = {978-1-4503-1182-3},
  doi       = {10.1145/2370816.2370865},
  url       = {http://doi.acm.org/10.1145/2370816.2370865},
  acmid     = {2370865},
  keywords  = {CPU-GPU, heterogeneous computing, multi2sim, simulation},
}

@inproceedings{Choi2012ReducingOffchipTraffic,
  author    = {Choi, Hyojin and Ahn, Jaewoo and Sung, Wonyong},
  title     = {Reducing off-chip memory traffic by selective cache management scheme in {GPGPUs}},
  booktitle = {Proceedings of the 5th Annual Workshop on General Purpose Processing with Graphics Processing Units},
  series    = {GPGPU-5},
  location  = {London, England},
  year      = {2012},
  pages     = {110--119},
  numpages  = {10},
  publisher = {ACM},
  address   = {New York, NY, USA},
  isbn      = {978-1-4503-1233-2},
  doi       = {10.1145/2159430.2159443},
  url       = {http://doi.acm.org/10.1145/2159430.2159443},
  acmid     = {2159443},
  keywords  = {GPGPU, cache replacement decision, off-chip memory traffic},
}

@inproceedings{Jia2012DemandCache,
  author    = {Jia, Wenhao and Shaw, Kelly A. and Martonosi, Margaret},
  title     = {Characterizing and improving the use of demand-fetched caches in {GPUs}},
  booktitle = {Proceedings of the 26th ACM international conference on Supercomputing},
  series    = {ICS '12},
  location  = {San Servolo Island, Venice, Italy},
  year      = {2012},
  pages     = {15--24},
  numpages  = {10},
  publisher = {ACM},
  address   = {New York, NY, USA},
  isbn      = {978-1-4503-1316-2},
  doi       = {10.1145/2304576.2304582},
  url       = {http://doi.acm.org/10.1145/2304576.2304582},
  acmid     = {2304582},
  keywords  = {CUDA, GPGPU, GPU cache, compiler optimization},
}

% Same paper as the entry keyed fung2007dynamic; both keys are kept so existing citations resolve.
@inproceedings{Fung2007DynamicWarpFormation,
  author    = {Fung, Wilson W. L. and Sham, Ivan and Yuan, George and Aamodt, Tor M.},
  title     = {Dynamic Warp Formation and Scheduling for Efficient {GPU} Control Flow},
  booktitle = {Proceedings of the 40th Annual IEEE/ACM International Symposium on Microarchitecture},
  series    = {MICRO 40},
  year      = {2007},
  pages     = {407--420},
  numpages  = {14},
  publisher = {IEEE Computer Society},
  address   = {Washington, DC, USA},
  isbn      = {0-7695-3047-8},
  doi       = {10.1109/MICRO.2007.12},
  url       = {http://dx.doi.org/10.1109/MICRO.2007.12},
  acmid     = {1331735},
}

% Author given names are spelled as in the Ubal2012Multi2sim entry of this file.
@article{bjang2011AccessPattern,
  author   = {Jang, Byunghyun and Schaa, Dana and Mistry, Perhaad and Kaeli, David},
  title    = {Exploiting Memory Access Patterns to Improve Memory Performance in Data-Parallel Architectures},
  journal  = {IEEE Transactions on Parallel and Distributed Systems},
  year     = {2011},
  month    = jan,
  volume   = {22},
  number   = {1},
  pages    = {105--118},
  keywords = {computer graphic equipment;coprocessors;multi-threading;parallel architectures;GPU;algorithmic memory selection;low-cost supercomputing;massive multithreaded data-parallel architectures;memory access patterns;memory subsystem;parallel computing;power budgets;scalar-based architectures;vector-based architectures;GPU computing;General-purpose computation on GPUs (GPGPUs);data parallelism;data-parallel architectures.;memory access pattern;memory coalescing;memory optimization;memory selection;vectorization},
  doi      = {10.1109/TPDS.2010.107},
  issn     = {1045-9219},
}

@inproceedings{Baghsorkhi2012EfficientPerfEval,
  author    = {Baghsorkhi, Sara S. and Gelado, Isaac and Delahaye, Matthieu and Hwu, Wen-mei W.},
  title     = {Efficient performance evaluation of memory hierarchy for highly multithreaded graphics processors},
  booktitle = {Proceedings of the 17th ACM SIGPLAN symposium on Principles and Practice of Parallel Programming},
  series    = {PPoPP '12},
  location  = {New Orleans, Louisiana, USA},
  year      = {2012},
  pages     = {23--34},
  numpages  = {12},
  publisher = {ACM},
  address   = {New York, NY, USA},
  isbn      = {978-1-4503-1160-1},
  doi       = {10.1145/2145816.2145820},
  url       = {http://doi.acm.org/10.1145/2145816.2145820},
  acmid     = {2145820},
  keywords  = {gpu, memory hierarchy, performance evaluation},
}

@book{jacob2008memory,
  author    = {Jacob, Bruce and Ng, Spencer W. and Wang, David T.},
  title     = {Memory Systems: Cache, {DRAM}, Disk},
  publisher = {Morgan Kaufmann Publishers},
  year      = {2008}
}

% Journal (SIGARCH Comput. Archit. News) record; the entry keyed Gebhart2011 has the same title and authors.
@article{Gebhart2011EnergyEffMechanmisms,
  author     = {Gebhart, Mark and Johnson, Daniel R. and Tarjan, David and Keckler, Stephen W. and Dally, William J. and Lindholm, Erik and Skadron, Kevin},
  title      = {Energy-efficient mechanisms for managing thread context in throughput processors},
  journal    = {SIGARCH Comput. Archit. News},
  volume     = {39},
  number     = {3},
  month      = jun,
  year       = {2011},
  issue_date = {June 2011},
  pages      = {235--246},
  numpages   = {12},
  publisher  = {ACM},
  address    = {New York, NY, USA},
  issn       = {0163-5964},
  doi        = {10.1145/2024723.2000093},
  url        = {http://doi.acm.org/10.1145/2024723.2000093},
  acmid      = {2000093},
  keywords   = {energy-efficiency, multi-threading, register file organization, throughput computing},
}

% The URL is stored unbroken; line breaking belongs to the document, not the database.
@misc{nvidia2012FermiWhitepaper,
  author = {{NVIDIA Corporation}},
  title  = {{NVIDIA's Next Generation CUDA Compute Architecture: Fermi}},
  note   = {http://www.nvidia.com/content/PDF/fermi\_white\_papers/NVIDIA\_Fermi\_Compute\_Architecture\_Whitepaper.pdf},
  year   = {2009},
}

% The exported record carried the conference dates ("Nov-1 Dec") in `year`; the year is 1995 per the proceedings title.
@inproceedings{Roh1995storagehierarchy,
  author    = {Roh, L. and Najjar, W. A.},
  title     = {Design of storage hierarchy in multithreaded architectures},
  booktitle = {Proceedings of the 28th Annual International Symposium on Microarchitecture},
  year      = {1995},
  month     = nov,
  pages     = {271--278},
  keywords  = {computer architecture;file organisation;multiprocessing systems;storage management;inter-thread locality;large scale multiprocessors;latency;multithreaded architectures;multithreaded execution;nonblocking threads;remote memory accesses;storage hierarchy;storage model;storage system;von Neumann model execution;Computer architecture;Computer science;Costs;Delay;Laboratories;Large-scale systems;Multithreading;Registers;Switches;Yarn},
  doi       = {10.1109/MICRO.1995.476836},
  issn      = {1072-4451},
}

@inproceedings{Baskaran:2008:CFO,
  author    = {Baskaran, Muthu Manikandan and Bondhugula, Uday and Krishnamoorthy, Sriram and Ramanujam, J. and Rountev, Atanas and Sadayappan, P.},
  title     = {A compiler framework for optimization of affine loop nests for {GPGPUs}},
  booktitle = {Proceedings of the 22nd annual international conference on Supercomputing},
  series    = {ICS '08},
  location  = {Island of Kos, Greece},
  year      = {2008},
  pages     = {225--234},
  numpages  = {10},
  publisher = {ACM},
  address   = {New York, NY, USA},
  isbn      = {978-1-60558-158-3},
  doi       = {10.1145/1375527.1375562},
  url       = {http://doi.acm.org/10.1145/1375527.1375562},
  acmid     = {1375562},
  keywords  = {GPU, empirical tuning, memory access optimization, polyhedral model},
}

@article{Buck:2004:BGS,
  author     = {Buck, Ian and Foley, Tim and Horn, Daniel and Sugerman, Jeremy and Fatahalian, Kayvon and Houston, Mike and Hanrahan, Pat},
  title      = {{Brook} for {GPUs}: stream computing on graphics hardware},
  journal    = {ACM Trans. Graph.},
  volume     = {23},
  number     = {3},
  month      = aug,
  year       = {2004},
  issue_date = {August 2004},
  pages      = {777--786},
  numpages   = {10},
  publisher  = {ACM},
  address    = {New York, NY, USA},
  issn       = {0730-0301},
  doi        = {10.1145/1015706.1015800},
  url        = {http://doi.acm.org/10.1145/1015706.1015800},
  acmid      = {1015800},
  keywords   = {Data Parallel Computing, GPU Computing, Brook, Programmable Graphics Hardware, Stream Computing},
}

% `pages` holds only the start page; the export's open-ended "35--" would print a dangling dash.
@inproceedings{Dally:2003:MSS,
  author    = {Dally, William J. and Labonte, Francois and Das, Abhishek and Hanrahan, Patrick and Ahn, Jung-Ho and Gummaraju, Jayanth and Erez, Mattan and Jayasena, Nuwan and Buck, Ian and Knight, Timothy J. and Kapasi, Ujval J.},
  title     = {Merrimac: Supercomputing with Streams},
  booktitle = {Proceedings of the 2003 ACM/IEEE conference on Supercomputing},
  series    = {SC '03},
  location  = {Phoenix, AZ, USA},
  year      = {2003},
  pages     = {35},
  publisher = {ACM},
  address   = {New York, NY, USA},
  isbn      = {1-58113-695-1},
  doi       = {10.1145/1048935.1050187},
  url       = {http://doi.acm.org/10.1145/1048935.1050187},
  acmid     = {1050187},
}

@article{HiCUDA,
  author   = {Han, T. D. and Abdelrahman, T. S.},
  title    = {{hiCUDA}: High-Level {GPGPU} Programming},
  journal  = {IEEE Transactions on Parallel and Distributed Systems},
  year     = {2011},
  volume   = {22},
  number   = {1},
  pages    = {78--90},
  keywords = {computer graphic equipment;coprocessors;program compilers;C-like interface;CUDA programming;compute unified device architecture;graphics processing units;high-level GPGPU programming;high-level directive-based language;prototype compiler;sequential code;Application software;Computer architecture;Computer graphics;Computer interfaces;Memory management;Packaging;Pipelines;Program processors;Programming profession;Prototypes;CUDA;GPGPU;data-parallel programming;directive-based language;source-to-source compiler.},
  doi      = {10.1109/TPDS.2010.62},
  issn     = {1045-9219},
}


% Same DOI as the entry keyed Hong2009; both keys are kept so existing citations resolve.
@inproceedings{Hong:2009:AMG,
  author    = {Hong, Sunpyo and Kim, Hyesoon},
  title     = {An analytical model for a {GPU} architecture with memory-level and thread-level parallelism awareness},
  booktitle = {Proceedings of the 36th annual international symposium on Computer architecture},
  series    = {ISCA '09},
  location  = {Austin, TX, USA},
  year      = {2009},
  pages     = {152--163},
  numpages  = {12},
  publisher = {ACM},
  address   = {New York, NY, USA},
  isbn      = {978-1-60558-526-0},
  doi       = {10.1145/1555754.1555775},
  url       = {http://doi.acm.org/10.1145/1555754.1555775},
  acmid     = {1555775},
  keywords  = {GPU architecture, analytical model, cuda, memory level parallelism, performance estimation, warp level parallelism},
}

@inproceedings{Seo:2009:SMC,
  author    = {Seo, Sangmin and Lee, Jaejin and Sura, Z.},
  title     = {Design and implementation of software-managed caches for multicores with local memory},
  booktitle = {IEEE 15th International Symposium on High Performance Computer Architecture (HPCA 2009)},
  year      = {2009},
  pages     = {55--66},
  keywords  = {cache storage;memory architecture;microprocessor chips;Cell BE processors;GPGPU;OpenMP;extended set-index cache;heterogeneous multicore architectures;instruction scheduling;software-managed caches;Computer architecture;Computer science;Delay;Hardware;Memory management;Multicore processing;Processor scheduling;Programming profession;Runtime;Sliding mode control},
  doi       = {10.1109/HPCA.2009.4798237},
  issn      = {1530-0897},
}

@inproceedings{Sung:2010:DLT,
  author    = {Sung, I-Jui and Stratton, John A. and Hwu, Wen-Mei W.},
  title     = {Data layout transformation exploiting memory-level parallelism in structured grid many-core applications},
  booktitle = {Proceedings of the 19th international conference on Parallel architectures and compilation techniques},
  series    = {PACT '10},
  location  = {Vienna, Austria},
  year      = {2010},
  pages     = {513--522},
  numpages  = {10},
  publisher = {ACM},
  address   = {New York, NY, USA},
  isbn      = {978-1-4503-0178-7},
  doi       = {10.1145/1854273.1854336},
  url       = {http://doi.acm.org/10.1145/1854273.1854336},
  acmid     = {1854336},
  keywords  = {GPU, data layout transformation, parallel programming},
}

@inproceedings{torres:CUDA,
  author    = {Torres, Y. and Gonzalez-Escribano, A. and Llanos, D. R.},
  title     = {Understanding the impact of {CUDA} tuning techniques for {Fermi}},
  booktitle = {2011 International Conference on High Performance Computing and Simulation (HPCS)},
  year      = {2011},
  pages     = {631--639},
  keywords  = {cache storage;coprocessors;multiprocessing systems;CUDA experienced programmers;CUDA tuning techniques;Fermi architecture;GPU capabilities;NVIDIA CUDA program;encoding;global memory access patterns;global-memory access pattern;multiprocessors;threadblocks;transparent cache;Cache memory;Graphics processing unit;Hardware;Instruction sets;Kernel;Shape;Tuning;Fermi;GPU;code tuning;performance},
  doi       = {10.1109/HPCSim.2011.5999886},
}

@article{Udayakumaran:2006:DAS,
  author     = {Udayakumaran, Sumesh and Dominguez, Angel and Barua, Rajeev},
  title      = {Dynamic allocation for scratch-pad memory using compile-time decisions},
  journal    = {ACM Trans. Embed. Comput. Syst.},
  volume     = {5},
  number     = {2},
  month      = may,
  year       = {2006},
  issue_date = {May 2006},
  pages      = {472--511},
  numpages   = {40},
  publisher  = {ACM},
  address    = {New York, NY, USA},
  issn       = {1539-9087},
  doi        = {10.1145/1151074.1151085},
  url        = {http://doi.acm.org/10.1145/1151074.1151085},
  acmid      = {1151085},
  keywords   = {Memory allocation, compiler, embedded systems, scratch pad, software caching, software-managed cache},
}

% Symposium name follows the spelling used by the bakhoda09 entry of this file.
@inproceedings{wong:DEMYGPU,
  author    = {Wong, H. and Papadopoulou, M.-M. and Sadooghi-Alvandi, M. and Moshovos, A.},
  title     = {Demystifying {GPU} microarchitecture through microbenchmarking},
  booktitle = {2010 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS)},
  year      = {2010},
  pages     = {235--246},
  keywords  = {computer graphics;coprocessors;GPU microarchitecture;Nvidia GT200 GPU;graphics processors;microbenchmarking;Clocks;Computer architecture;Delay;Hardware;Kernel;Microarchitecture;Performance analysis;Registers;Samarium;Yarn},
  doi       = {10.1109/ISPASS.2010.5452013},
}

@inproceedings{Yang:2010:GCM,
  author    = {Yang, Yi and Xiang, Ping and Kong, Jingfei and Zhou, Huiyang},
  title     = {A {GPGPU} compiler for memory optimization and parallelism management},
  booktitle = {Proceedings of the 2010 ACM SIGPLAN conference on Programming language design and implementation},
  series    = {PLDI '10},
  location  = {Toronto, Ontario, Canada},
  year      = {2010},
  pages     = {86--97},
  numpages  = {12},
  publisher = {ACM},
  address   = {New York, NY, USA},
  isbn      = {978-1-4503-0019-3},
  doi       = {10.1145/1806596.1806606},
  url       = {http://doi.acm.org/10.1145/1806596.1806606},
  acmid     = {1806606},
  keywords  = {compiler, gpgpu},
}

@inproceedings{Zhang:2010:SGA,
  author    = {Zhang, Eddy Z. and Jiang, Yunlian and Guo, Ziyu and Shen, Xipeng},
  title     = {Streamlining {GPU} applications on the fly: thread divergence elimination through runtime thread-data remapping},
  booktitle = {Proceedings of the 24th ACM International Conference on Supercomputing},
  series    = {ICS '10},
  location  = {Tsukuba, Ibaraki, Japan},
  year      = {2010},
  pages     = {115--126},
  numpages  = {12},
  publisher = {ACM},
  address   = {New York, NY, USA},
  isbn      = {978-1-4503-0018-6},
  doi       = {10.1145/1810085.1810104},
  url       = {http://doi.acm.org/10.1145/1810085.1810104},
  acmid     = {1810104},
  keywords  = {CPU-GPU pipelining, GPGPU, data transformation, thread divergence, thread-data remapping},
}











@misc{ivybridge,
  title = {{Intel Ivy Bridge}},
  note  = {http://ark.intel.com/products/codename/29902/Ivy-Bridge},
}

@misc{sandybridge,
  title = {{Intel Sandy Bridge}},
  note  = {http://software.intel.com/en-us/articles/sandy-bridge/},
}

% Placeholder entry: the title text itself states that the citation was removed for anonymity.
@misc{multi2sim,
  title = {{This citation was removed to preserve anonymity.}},
}

@inproceedings{yeh93,
  author    = {Yeh, Tse-Yu and Patt, Yale N.},
  title     = {{A Comparison of Dynamic Branch Predictors that Use two Levels of Branch History}},
  booktitle = {Proc. of the 20th Int'l Symposium on Computer Architecture},
  year      = {1993}
}

% First author's given name is spelled as in the yeh93 entry so both sort and print identically.
@inproceedings{yeh93rate,
  author    = {Yeh, Tse-Yu and Marr, Deborah T. and Patt, Yale N.},
  title     = {{Increasing the Instruction Fetch Rate via Multiple Branch Prediction and a Branch Address Cache}},
  booktitle = {Proc. of the 7th ACM Conference on Supercomputing},
  year      = {1993}
}

@inproceedings{rotenberg96,
  title     = {{Trace Cache: a Low Latency Approach to High Bandwidth Instruction Fetching}},
  author    = {E. Rotenberg and J. Smith and S. Bennett},
  booktitle = {Proc. of the 29th Int'l Symposium on Microarchitecture},
  year      = {1996},
}

@inproceedings{mcpat,
  author    = {S. Li and J. H. Ahn and R. D. Strong and J. B. Brockman and D. M. Tullsen and N. P. Jouppi},
  title     = {{McPAT: An Integrated Power, Area, and Timing Modeling Framework for Multicore and Manycore Architectures}},
  booktitle = {Proc. of the 42nd Int'l Symposium on Microarchitecture},
  month     = dec,
  year      = {2009}
}

@techreport{cacti,
  title       = {{CACTI 6.0: A Tool to Model Large Caches}},
  author      = {N. Muralimanohar and R. Balasubramonian and N. P. Jouppi},
  institution = {School of Computing, University of Utah},
  year        = {2007},
}

% A paper inside an LNCS volume: typed as inproceedings so that `booktitle` is actually used.
@inproceedings{qian07,
  author    = {X. Qian and H. Huang and Z. Duan and J. Zhang and N. Yuan and Y. Zhou and H. Zhang and H. Cui and D. Fan},
  title     = {{Optimized Register Renaming Scheme for Stack-Based x86 Operations}},
  booktitle = {Architecture of Computing Systems - ARCS 2007},
  series    = {Lecture Notes in Computer Science},
  volume    = {4415},
  publisher = {Springer Berlin / Heidelberg},
  year      = {2007}
}


% Two individual authors, so the names are not wrapped as a single corporate author.
@misc{amd-gcn,
  author = {Houston, M. and Mantor, M.},
  title  = {{AMD Graphics Core Next}},
  note   = {http://developer.amd.com/afds/assets/presentations/2620\_final.pdf}
}





@misc{khronos,
  title = {{OpenCL: The Open Standard for Parallel Programming of Heterogeneous Systems}},
  note  = {http://www.khronos.org/opencl/}
}

@misc{oclspec,
  author = {{Khronos Group}},
  title  = {{OpenCL 1.1 Specification}},
  month  = oct,
  year   = {2010},
  url    = {http://www.khronos.org/opencl/}
}

@misc{evergreen,
  title = {{AMD Evergreen Family Instruction Set Arch. (v1.0d)}},
  note  = {http://developer.amd.com/sdks/amdappsdk/documentation/},
}

@misc{cal,
  title = {{AMD Compute Abstraction Layer Programming Guide}},
  note  = {http://developer.amd.com/sdks/amdappsdk/documentation/},
}

@misc{ptx,
  title = {{NVIDIA PTX: Parallel Thread Execution ISA}},
  note  = {http://developer.nvidia.com/cuda-downloads/},
}

@misc{amdil,
  title = {{AMD Intermediate Language (IL) Spec. (v2.0e)}},
  note  = {http://developer.amd.com/sdks/amdappsdk/documentation/},
}

@misc{appguide,
  title = {{AMD Accelerated Parallel Processing OpenCL Programming Guide (v1.3c)}}
}

% Same paper as the entry keyed Zhang2011b, from which the full author names are taken.
@inproceedings{zhang11,
  author    = {Zhang, Yao and Owens, John D.},
  title     = {{A Quantitative Performance Analysis Model for GPU Architectures}},
  booktitle = {Proc. of the 17th Int'l Symposium on High Performance Computer Architecture},
  month     = feb,
  year      = {2011}
}

@inproceedings{diamos10,
  author    = {Diamos, Gregory and Kerr, Andrew and Yalamanchili, Sudhakar and Clark, Nathan},
  title     = {{Ocelot: a Dynamic Optimization Framework for Bulk-Synchronous Applications in Heterogeneous Systems}},
  booktitle = {Proc. of the 19th Int'l Conference on Parallel Architectures and Compilation Techniques},
  month     = sep,
  year      = {2010}
}

% The url has the institution-specific library-proxy host stripped; `pages` holds only the start page.
@inproceedings{lattner04,
  author    = {Lattner, Chris and Adve, Vikram},
  title     = {{LLVM: A Compilation Framework for Lifelong Program Analysis \& Transformation}},
  booktitle = {{Proceedings of the International Symposium on Code Generation and Optimization: Feedback-Directed and Runtime Optimization}},
  series    = {CGO '04},
  location  = {Palo Alto, California},
  year      = {2004},
  pages     = {75},
  publisher = {IEEE Computer Society},
  address   = {Washington, DC, USA},
  isbn      = {0-7695-2102-9},
  url       = {http://portal.acm.org/citation.cfm?id=977395.977673},
  acmid     = {977673},
}

@inproceedings{collange09,
  author    = {S. Collange and M. Daumas and D. Defour and D. Parello},
  title     = {{Barra: A Parallel Functional Simulator for GPGPU}},
  booktitle = {Proc. of the 18th Int'l Symposium on Modeling, Analysis and Simulation of Computer and Telecommunication Systems (MASCOTS)},
  month     = aug,
  year      = {2010}
}

@inproceedings{bakhoda09,
  author    = {Bakhoda, A. and Yuan, G. L. and Fung, W. W. L. and Wong, H. and Aamodt, T. M.},
  title     = {{Analyzing CUDA Workloads Using a Detailed GPU Simulator}},
  booktitle = {Proc. of the Int'l Symposium on Performance Analysis of Systems and Software (ISPASS)},
  month     = apr,
  year      = {2009}
}

@misc{fusion,
  title = {{The AMD Fusion Family of APUs}},
  note  = {http://fusion.amd.com/}
}

@misc{denver,
  title = {{The NVIDIA Denver Project}},
  note  = {http://blogs.nvidia.com/},
}

% A downloadable spreadsheet, not a journal article; the title is split out of the former url value.
@misc{nvidia-occupancy,
  author = {{NVIDIA Corporation}},
  title  = {{CUDA Occupancy Calculator}},
  note   = {http://developer.download.nvidia.com/compute/cuda/CUDA\_Occupancy\_calculator.xls}
}

% NOTE(review): institution supplied from the report's known provenance; confirm against the report cover.
@techreport{simplescalar,
  author      = {D. C. Burger and T. M. Austin},
  title       = {{The SimpleScalar Tool Set, Version 2.0}},
  institution = {Computer Sciences Department, University of Wisconsin--Madison},
  number      = {CS-TR-1997-1342},
  year        = {1997}
}

% The venue is a conference, so it is stored in `booktitle` of an inproceedings entry.
@inproceedings{smtsim,
  author    = {D. M. Tullsen},
  title     = {{Simulation and Modeling of a Simultaneous Multithreading Processor}},
  booktitle = {22nd Annual Computer Measurement Group Conference},
  month     = dec,
  year      = {1996}
}

% A technical report: number and institution are split out of the former `journal` value.
@techreport{msim,
  author      = {J. Sharkey},
  title       = {{M-Sim: A Flexible, Multithreaded Architectural Simulation Environment}},
  institution = {Department of Computer Science, State University of New York at Binghamton},
  number      = {CS-TR-05-DP01},
  year        = {2005}
}

% "and others" lets the style print et al.; a literal "et. al" would be parsed as part of the name.
@article{simics02,
  author  = {Magnusson, P. S. and others},
  title   = {{Simics: A Full System Simulation Platform}},
  journal = {IEEE Computer},
  volume  = {35},
  number  = {2},
  year    = {2002}
}

% The venue is a symposium, so it is stored in `booktitle` of an inproceedings entry.
@inproceedings{gems06,
  author    = {M. R. Marty and B. Beckmann and L. Yen and A. R. Alameldeen and M. Xu and K. Moore},
  title     = {{GEMS: Multifacet's General Execution-Driven Multiprocessor Simulator}},
  booktitle = {Proc. of the 33rd Int'l Symposium on Computer Architecture},
  month     = jun,
  year      = {2006}
}

% The venue is a workshop, so it is stored in `booktitle` of an inproceedings entry.
@inproceedings{m5,
  author    = {N. L. Binkert and E. G. Hallnor and S. K. Reinhardt},
  title     = {{Network-Oriented Full-System Simulation Using M5}},
  booktitle = {6th Workshop on Computer Architecture Evaluation using Commercial Workloads (CAECW)},
  month     = feb,
  year      = {2003}
}


@inproceedings{yuan2009complexity,
  author    = {Yuan, G. L. and Bakhoda, A. and Aamodt, T. M.},
  title     = {{Complexity Effective Memory Access Scheduling for Many-Core Accelerator Architectures}},
  booktitle = {42nd Int'l Symposium on Microarchitecture},
  month     = dec,
  year      = {2009}
}


% Same paper as the entry keyed Fung2007DynamicWarpFormation; both keys are kept so existing citations resolve.
@inproceedings{fung2007dynamic,
  author    = {W. W. L. Fung and I. Sham and G. Yuan and T. M. Aamodt},
  title     = {{Dynamic Warp Formation and Scheduling for Efficient GPU Control Flow}},
  booktitle = {Proc. of the 40th Int'l Symposium on Microarchitecture},
  month     = dec,
  year      = {2007}
}

% Same DOI as the entry keyed Hong:2009:AMG, which supplies the full page range.
@inproceedings{Hong2009,
  author    = {Hong, Sunpyo and Kim, Hyesoon},
  title     = {{An analytical model for a GPU architecture with memory-level and thread-level parallelism awareness}},
  booktitle = {Proceedings of the 36th annual international symposium on Computer architecture - ISCA '09},
  year      = {2009},
  pages     = {152--163},
  publisher = {ACM Press},
  address   = {New York, New York, USA},
  isbn      = {9781605585260},
  doi       = {10.1145/1555754.1555775}
}

% Same title as the entry keyed zhang11.
@inproceedings{Zhang2011b,
  author    = {Zhang, Yao and Owens, John D.},
  title     = {{A quantitative performance analysis model for GPU architectures}},
  booktitle = {2011 IEEE 17th International Symposium on High Performance Computer Architecture},
  month     = feb,
  year      = {2011},
  pages     = {382--393},
  publisher = {IEEE},
  isbn      = {978-1-4244-9432-3},
  doi       = {10.1109/HPCA.2011.5749745},
}


% Proceedings record; page range taken from the Gebhart2011EnergyEffMechanmisms entry (same title and authors).
@inproceedings{Gebhart2011,
  author    = {Gebhart, Mark and Johnson, Daniel R. and Tarjan, David and Keckler, Stephen W. and Dally, William J. and Lindholm, Erik and Skadron, Kevin},
  title     = {{Energy-efficient mechanisms for managing thread context in throughput processors}},
  booktitle = {Proceedings of the 38th annual international symposium on Computer architecture - ISCA '11},
  year      = {2011},
  pages     = {235--246},
  publisher = {ACM Press},
  address   = {New York, New York, USA},
  doi       = {10.1145/2000064.2000093},
  url       = {http://portal.acm.org/citation.cfm?doid=2000064.2000093}
}
